library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.3.2 ──✔ ggplot2 3.3.6 ✔ purrr 0.3.4
✔ tibble 3.1.8 ✔ dplyr 1.0.10
✔ tidyr 1.2.1 ✔ stringr 1.4.1
✔ readr 2.1.3 ✔ forcats 0.5.2 ── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(lubridate)
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
load("data_ml.RData")
data_ml %>%
filter(date > "1999-12-31",
date < "2019-01-01") %>%
arrange(stock_id, date)
NA
data_ml <- data_ml %>%
group_by(date) %>%
mutate(R1M_Usd_C = R1M_Usd > median(R1M_Usd),
R12M_Usd_C = R12M_Usd > median(R12M_Usd)) %>%
ungroup() %>%
mutate_if(is.logical, as.factor)
data_ml
stock_ids <- levels(as.factor(data_ml$stock_id))
stock_days <- data_ml %>%
group_by(stock_id) %>%
summarise(nb = n())
stock_ids_short <- stock_ids[which(stock_days$nb == max(stock_days$nb))]
returns <- data_ml %>%
filter(stock_id %in% stock_ids_short) %>%
dplyr::select(date, stock_id, R1M_Usd) %>%
pivot_wider(names_from = "stock_id",
values_from = "R1M_Usd")
returns
NA
This dataset comprises information on 1,207 stocks listed in the US (possibly originating from Canada or Mexico). The time range starts in November 1998 and ends in March 2019. For each point in time, 93 characteristics describe the firms in the sample.
Our first observation is regarding the size factor, which states that companies with small capitalization tends grant higher returns than large capitalization. For that we create an equally weighted portofolio and summarize the mean return.
data_ml %>%
group_by(date) %>%
mutate(large = Mkt_Cap_12M_Usd > median(Mkt_Cap_12M_Usd)) %>% # Creates a Large Cap colum
ungroup() %>%
mutate(year = lubridate::year(date)) %>% # Creates a year variable
group_by(year, large) %>%
summarize(avg_return = mean(R1M_Usd)) %>% # Compute the avg return
#Plot
ggplot(aes(x= year, y= avg_return, fill = large)) +
geom_col(position = "dodge") + theme_light() +
theme(legend.position = c(0.8, 0.2)) +
coord_fixed(124) + theme(legend.title = element_blank()) +
scale_fill_manual(values = c("#F87E1F", "#0570EA"), name = "",
labels = c("Small", "Large"))
`summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
library(quantmod) # Data extraction
Loading required package: xts
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
Attaching package: ‘xts’
The following objects are masked from ‘package:dplyr’:
first, last
Loading required package: TTR
Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
library(xtable) # LaTex exports
min_date <- "1963-07-31"
max_date <- "2020-03-28"
temp <- tempfile()
kf_website <- "http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/"
kf_file <- "ftp/F-F_Research_Data_5_Factors_2x3_CSV.zip"
link <- paste0(kf_website, kf_file) # Linking the sites
download.file(link, temp, quiet = TRUE) #Download the site!
ff_factors <- read_csv(unz(temp, "F-F_Research_Data_5_Factors_2x3.csv"), skip = 2 ) %>%
rename(date = `...1`, MKT_RF = `Mkt-RF`) %>%
mutate_at(vars(-date), as.numeric) %>% # Convert valeus to numbers
mutate(date = ymd(parse_date_time(date, "%Y%m"))) %>% # Right format
mutate(date = rollback(date + months(1))) %>% # End of month date
mutate( MKT_RF = MKT_RF / 100,
SMB = SMB / 100,
HML = HML / 100,
RMW = RMW / 100,
CMA = CMA / 100,
RF = RF/100) %>%
filter(date >= min_date, date <= max_date)
New names:Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 775 Columns: 7── Column specification ───────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): ...1, Mkt-RF, SMB, HML, RMW, CMA, RF
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Warning: NAs introduced by coercionWarning: NAs introduced by coercionWarning: NAs introduced by coercionWarning: NAs introduced by coercionWarning: NAs introduced by coercionWarning: NAs introduced by coercionWarning: 60 failed to parse.
ff_factors
NA
ff_factors %>%
mutate(date = year(date)) %>%
gather(key= factor, value = value, -date) %>%
group_by(date, factor) %>%
summarise(value = mean(value)) %>%
ggplot(aes(x = date, y = value, color = factor)) +
geom_line() + coord_fixed(500)
`summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
nb_factors <- 5
data_fm <- left_join(data_ml %>%
dplyr::select(date, stock_id, R1M_Usd) %>%
filter(stock_id %in% stock_ids_short),
ff_factors,
by = "date") %>%
group_by(stock_id) %>%
mutate(R1M_Usd = lag(R1M_Usd)) %>%
ungroup() %>%
na.omit() %>%
pivot_wider(names_from = "stock_id", values_from = "R1M_Usd")
models <- lapply(paste0("`", stock_ids_short,
'` ~ MKT_RF + SMB + HML + RMW + CMA'),
function(f){lm(as.formula(f), data= data_fm,
na.action = "na.exclude") %>%
summary() %>%
"$"(coef) %>%
data.frame() %>%
dplyr::select(Estimate)}
)
betas <- matrix(unlist(models), ncol = nb_factors + 1, byrow = T) %>%
data.frame(row.names = stock_ids_short)
colnames(betas) <- c("Constant", "MKT_RF", "SMB", "HML", "RMW", "CMA")
knitr::kable(head(betas), booktabs = TRUE,
caption = "Sample of beta values (row numbers are stock IDs)")
| Constant | MKT_RF | SMB | HML | RMW | CMA | |
|---|---|---|---|---|---|---|
| 3 | -0.0017164 | 0.8020773 | 0.8356087 | 0.8320418 | 0.1128188 | -0.2439281 |
| 4 | 0.0035508 | 0.3139552 | 0.2748537 | -0.1524879 | 0.4576615 | 0.4708250 |
| 7 | 0.0049663 | 0.5261486 | 0.5214059 | 0.0306593 | 0.3206683 | 0.3530033 |
| 9 | 0.0044245 | 0.7470505 | 0.6267144 | 1.0142096 | -0.0520735 | -0.0600390 |
| 16 | 0.0011722 | 1.1883735 | -0.1612352 | 1.3812378 | 0.1800120 | -0.6041161 |
| 22 | 0.0018610 | 0.5919958 | 0.5707547 | 0.3307818 | 0.4701422 | 0.1969087 |
betas
loadings <- betas %>%
dplyr::select(-Constant) %>%
data.frame()
ret <- returns %>%
dplyr::select(-date) %>%
data.frame(row.names = returns$date) %>%
t()
fm_data <- cbind(loadings, ret)
fm_data
models <- lapply(paste("`", returns$date, "`", ' ~ MKT_RF + SMB + HML + RMW + CMA', sep = ""),
function(f){ lm(as.formula(f), data = fm_data) %>% # Call lm(.)
summary() %>% # Gather the output
"$"(coef) %>% # Keep only the coefs
data.frame() %>% # Convert to dataframe
dplyr::select(Estimate)} # Keep only estimates
)
gammas <- matrix(unlist(models), ncol = nb_factors + 1, byrow = T) %>% # Switch to dataframe
data.frame(row.names = returns$date) # & set row names
colnames(gammas) <- c("Constant", "MKT_RF", "SMB", "HML", "RMW", "CMA") # Set col names
gammas
gammas[1:nrow(gammas),] %>% # Take gammas:
# The first row is omitted because the first row of returns is undefined
dplyr::select(MKT_RF, SMB, HML) %>% # Select 3 factors
bind_cols(date = data_fm$date) %>% # Add date
gather(key = factor, value = gamma,-date) %>%
ggplot(aes(x = date, y = gamma, color = factor)) +
geom_line() + facet_grid(factor~.)